### This R script is to make figure 4

# open necessary packages
library(tidyverse)
library(dotwhisker)
library(fixest)

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")
data_1 <- read.csv("core_19.csv")
data_2 <- read.csv("core_23.csv")
data <- list(data_1,data_2) 
names(data) <- paste("data_",c(1,2),sep="")
rm(data_1); rm(data_2) # drop individual data objects

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
if(is.null(data$data_1$cdis1) & !is.null(data$data_1$cdis)){
  data$data_1 <- data$data_1 %>% rename(cdis1=cdis)
}
if(is.null(data$data_2$cdis1) & !is.null(data$data_2$cdis)){
  data$data_2 <- data$data_2 %>% rename(cdis1=cdis)
}

## import list of teacher variables
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.name
varnames <- varnames[!is.na(varnames)] 

# add simple_avg to varnames
varnames <- c(varnames,"simple_avg")

### 1) Split the data by primary mode of instruction during the 2020-21 school year:
### The learning format variables show percent of time spent in each format, so to figure 
### out primary mode of instruction, use the maximum of (share_inperson, share_hybrid and share_virtual)

### Drop schools without learning format variables:
data <- lapply(data, function(x) filter(x, !is.na(share_inperson)))

### Make primary mode of instruction dummies for data period 1:
learning_format_vars <- select(data$data_1, share_inperson, share_hybrid, share_virtual)
learning_format_vars$max_names <- apply(learning_format_vars,1,function(x) names(which.max(x)))
summary(learning_format_vars)

data$data_1$primary_inperson <- ifelse(learning_format_vars$max_names=="share_inperson", 1, 0)
data$data_1$primary_hybrid <- ifelse(learning_format_vars$max_names=="share_hybrid", 1, 0)
data$data_1$primary_virtual <- ifelse(learning_format_vars$max_names=="share_virtual", 1, 0)
a <- which(colnames(data$data_1)=="primary_inperson"); b <- which(colnames(data$data_1)=="primary_virtual")
summary(rowSums(data$data_1[,a:b])) # checking that dummies sum to one


### Make primary mode of instruction dummies for data period 2:
learning_format_vars <- select(data$data_2, share_inperson, share_hybrid, share_virtual)
learning_format_vars$max_names <- apply(learning_format_vars,1,function(x) names(which.max(x)))

data$data_2$primary_inperson <- ifelse(learning_format_vars$max_names=="share_inperson", 1, 0)
data$data_2$primary_hybrid <- ifelse(learning_format_vars$max_names=="share_hybrid", 1, 0)
data$data_2$primary_virtual <- ifelse(learning_format_vars$max_names=="share_virtual", 1, 0)
a <- which(colnames(data$data_2)=="primary_inperson"); b <- which(colnames(data$data_2)=="primary_virtual")
summary(rowSums(data$data_2[,a:b])) # checking that dummies sum to one

### 2) Get differences across time for primarily in person schools ------------------------
## Label the weights you'll use for each year as "weight"
data$data_1 <- mutate(data$data_1, weight=teacher_n)
data$data_2 <- mutate(data$data_2, weight=teacher_n)

## Split data:
inperson <- lapply(data, function(x) filter(x, primary_inperson==1))
hybrid <- lapply(data, function(x) filter(x, primary_hybrid==1))
virtual <- lapply(data, function(x) filter(x, primary_virtual==1))

## Next write a function that calculates weighted average
weighted_avg <- function(x,var){ # be sure to type the variable name with "" surrounding it
  if(!is.null(x[[var]])){
    return(weighted.mean(x[[var]],x[["weight"]], na.rm=T))
  } else{return(NA)}}

# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(inperson, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from data period 1 - data period 2
Table <- matrix(NA, length(varnames), 3)
Table <- as.data.frame(Table)
colnames(Table) <- c("inperson", "hybrid", "virtual") 
rownames(Table) <- rownames(weighted_scores)

Table$inperson <- weighted_scores$data_2 - weighted_scores$data_1

### 3) Get differences across time for primarily hybrid schools ------------------------
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(hybrid, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from data period 1 - data period 2
Table$hybrid <- weighted_scores$data_2 - weighted_scores$data_1

### 4) Get differences across time for primarily virtual schools ------------------------
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(virtual, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from data period 1 - data period 2
Table$virtual <- weighted_scores$data_2 - weighted_scores$data_1

### 5) statistical significance ------------------------

## write function to run statistical significance test:
# Score(it) = a0 + b1*year_2(t) + b2*Hybrid(i) + b3*Online(i) + b4*(year_2(t)*Hybrid(i)) + b5*(year_2(t)*Online(i)) + u(it)

# weight by "weight" and cluster by district

# make second data period dummy:
data$data_1$year_2 <- 0
data$data_2$year_2 <- 1

# data_1 is the earlier year, data_2 is the later year
learning_format_statsig <- function(data_1, data_2, var){ # be sure to type variable name with ""
  if(is.null(data_1[[var]]) | is.null(data_2[[var]])){ # clause that gives an NA if the variable is not available in one of the years
    output <- c(NA,NA,NA,NA); names(output) <- c("coefficient_B4_hybrid","p-value_B4_hybrid",
                                           "coefficient_B5_virtual","p-value_B5_virtual")}
  else{
    stack_1 <- data.frame(construct=data_1[[var]], weight=data_1[["weight"]], year_2=data_1[["year_2"]], 
                          primary_hybrid=data_1[["primary_hybrid"]], primary_virtual=data_1[["primary_virtual"]],leaid=data_1[["leaid"]])
    stack_2 <- data.frame(construct=data_2[[var]], weight=data_2[["weight"]], year_2=data_2[["year_2"]], 
                          primary_hybrid=data_2[["primary_hybrid"]], primary_virtual=data_2[["primary_virtual"]],leaid=data_2[["leaid"]])
    stack <- rbind(stack_1, stack_2)
    
    regression <- feols(construct ~ year_2*(primary_hybrid + primary_virtual), cluster=~leaid, data=stack, weights=~weight)
    coeff_B4 <- coef(regression)["year_2:primary_hybrid"]
    p_val_B4 <- pvalue(regression)["year_2:primary_hybrid"]
    coeff_B5 <- coef(regression)["year_2:primary_virtual"]
    p_val_B5 <- pvalue(regression)["year_2:primary_virtual"]
    
    output <- c(coeff_B4,p_val_B4,coeff_B5,p_val_B5); names(output) <- c("coefficient_B4_hybrid","p-value_B4_hybrid",
                                                                         "coefficient_B5_virtual","p-value_B5_virtual")}
  return(output)
}

# check work - it worked if B4 == difference between changes for primarily in-person and hybrid schools
# and similarly with B5 and primarily virtual schools
test <- learning_format_statsig(data$data_1, data$data_2, "colb")
diff_hybrid_inperson <- Table["colb","hybrid"]-Table["colb","inperson"]
diff_virtual_inperson <- Table["colb","virtual"]-Table["colb","inperson"]
test["coefficient_B4_hybrid"]-diff_hybrid_inperson # should be very small amount
test["coefficient_B5_virtual"]-diff_virtual_inperson # should be very small amount

### get p-vals for each variable
Table_p_vals <- matrix(NA, nrow(Table), 2)
Table_p_vals <- as.data.frame(Table_p_vals)
colnames(Table_p_vals) <- c("hybrid","virtual")
rownames(Table_p_vals) <- rownames(Table)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_p_vals$hybrid[i] <- learning_format_statsig(data$data_1, data$data_2, var)["p-value_B4_hybrid"]
  Table_p_vals$virtual[i] <- learning_format_statsig(data$data_1, data$data_2, var)["p-value_B5_virtual"]
}


### Check work on statistical significance tests and calculate statistical significance at 5% level:
Table_p_vals>1
Table_p_vals<0
stat_sig <- Table_p_vals<=.05
stat_sig <- as.data.frame(stat_sig)

### 6) Combine differences and statistical significance into single table:

### scale means by 20 and drop 'data' variable
Table <- Table/20
Table <- Table[-which(rownames(Table)=='data'),]
stat_sig <- stat_sig[-which(rownames(stat_sig)=='data'),]

# make a dataframe with asterisks where result is statistically significant:
asterisk <- matrix(NA, nrow(stat_sig), ncol(stat_sig))
asterisk <- as.data.frame(asterisk)
colnames(asterisk) <- colnames(stat_sig)
rownames(asterisk) <- rownames(stat_sig)

for(j in 1:ncol(stat_sig)){
  for(i in 1:nrow(stat_sig)){
    if(is.na(stat_sig[i,j])){asterisk[i,j] <- ""}
    else if(stat_sig[i,j]){asterisk[i,j] <- "*"}
    else{asterisk[i,j] <- ""}
  }
}
asterisk; stat_sig

Table_final <- matrix(NA, nrow(Table), ncol(Table))
Table_final <- as.data.frame(Table_final)
colnames(Table_final) <- colnames(Table)
rownames(Table_final) <- rownames(Table)

# round differences and combine with asterisks:
Table_rounded <- round(Table,2)
Table_final["inperson"] <- Table_rounded["inperson"]
Table_final[,"hybrid"] <- sprintf("%s%s", Table_rounded[,"hybrid"], asterisk[,"hybrid"])
Table_final[,"virtual"] <- sprintf("%s%s", Table_rounded[,"virtual"], asterisk[,"virtual"])

# 7) Get a table of standard errors for the difference across time for primarily in-person, hybrid and virtual schools

## write function for regression:
# For regression on construct c and difference over 2019-2023,
# a) Stack 2019 and 2023 data
# b) Run regressions of the following form for each variable:

# Y_ict = B0 + B1*T_t + epsilon_ict
# Where i = school, c = construct (survey variable), t = time 
# T is a year dummy (= 1 if data is from 2023)
# run the regressions as weighted (GLS) regressions and cluster by district

# c) store the std error on B1

# data_1 is the earlier year, data_2 is the later year
wtd.reg <- function(data_1, data_2, var){ # be sure to type variable name with ""
  if(is.null(data_1[[var]]) | is.null(data_2[[var]])){ # clause that gives an NA if the variable is not available in one of the years
    output <- c(NA,NA); names(output) <- c("coefficient","std error")}
  else{
    stack_1 <- data.frame(construct=data_1[[var]], weight=data_1[["weight"]],leaid=data_1[["leaid"]])
    stack_2 <- data.frame(construct=data_2[[var]], weight=data_2[["weight"]],leaid=data_2[["leaid"]])
    stack_1$later_year_dummy <- 0
    stack_2$later_year_dummy <- 1
    stack <- rbind(stack_1, stack_2)
    
    regression <- feols(construct ~ later_year_dummy,  cluster=~leaid, data=stack, weights=~weight)
    coeff_B1 <- coef(regression)["later_year_dummy"]
    std_error_B1 <- se(regression)["later_year_dummy"]
    output <- c(coeff_B1,std_error_B1); names(output) <- c("coefficient","std error")}
  return(output)
}

# check work
test <- wtd.reg(inperson$data_1, inperson$data_2, "colb")
scaled_diff_in_means <- Table$inperson[which(rownames(Table)=="colb")]
test[1]/20 - scaled_diff_in_means # should be very small amount

### get std error for each variable and difference - fill in std errors by column of table 6
Table_std_errors <- matrix(NA, nrow(Table), ncol(Table))
Table_std_errors <- as.data.frame(Table_std_errors)
colnames(Table_std_errors) <- colnames(Table)
rownames(Table_std_errors) <- rownames(Table)
varnames <- varnames[-which(varnames=="data")]

# inperson
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$inperson[i] <- wtd.reg(inperson$data_1, inperson$data_2, var)[2]
}

# hybrid
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$hybrid[i] <- wtd.reg(hybrid$data_1, hybrid$data_2, var)[2]
}

# virtual
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$virtual[i] <- wtd.reg(virtual$data_1, virtual$data_2, var)[2]
}

Table_std_errors

# check work
Table_std_errors<0

### Divide by 20 
Table_std_errors <- Table_std_errors/20

# 8) Make dot and whisker plot
# need columns 'term', 'estimate', 'std.error' and 'model'
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.definition
varnames <- varnames[!is.na(varnames)]
varnames <- varnames[-which(varnames=="Collective Use of Assessment Data")]
varnames <- c(varnames, "Simple Average")

# combine in-person, hybrid and virtual values
plot.data.inperson <- data.frame(term=varnames,estimate=Table$inperson,
                                         std.error=Table_std_errors$inperson,model="In-Person")
plot.data.hybrid <- data.frame(term=varnames,estimate=Table$hybrid,
                                 std.error=Table_std_errors$hybrid,model="Hybrid")
plot.data.virtual <- data.frame(term=varnames,estimate=Table$virtual,
                                 std.error=Table_std_errors$virtual,model="Virtual")

plot.data <- rbind(plot.data.inperson,plot.data.hybrid,plot.data.virtual)
plot.data$color <- ifelse(plot.data$estimate>=0,"positive","negative")

dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green4","negative"="red"))+
  facet_wrap(~factor(model, levels=c("In-Person","Hybrid","Virtual")),
             ncol=ncol(Table))+theme(legend.position="none") 

# add asterisks to variable names for statistical significance between in person and hybrid groups
# and carats for significance between in person and virtual
plot.data.backup <- plot.data
asterisk.backup <- asterisk

for(i in 1:nrow(stat_sig)){
  if(is.na(stat_sig[i,"virtual"])){asterisk[i,"virtual"] <- ""}
  else if(stat_sig[i,"virtual"]){asterisk[i,"virtual"] <- "^"}
  else{asterisk[i,"virtual"] <- ""}
}

plot.data$term <- sprintf("%s %s%s", varnames, asterisk$hybrid, asterisk$virtual)
dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green4","negative"="red"))+
  facet_wrap(~factor(model, levels=c("In-Person","Hybrid","Virtual")),
             ncol=ncol(Table))+theme(legend.position="none") 
# exported as 960x620

